Projekt był dla mnie wymagający i ciekawy, ponieważ pozwolił mi zmierzyć się z analizą tak dużych danych. Najwięcej problemów przystworzyło mi ciągłe dostosowywanie danych tak, aby wystarczyło mi pamięci operacyjnej. Wykonałem większość z postawionych zadań, ale miałem też problemy mniejsze lub większe. Klasyfikator - najtrudniejsza była dla mnie budowa klasyfikatora i niestety do tej pory nie działa on najlepiej, ponieważ daje wyniki tylko na małej ilości danych, a nie na całym zbiorze. Rozkład wartości kolumn zaczynających się od part01_ - uważam, że przedstawienie wszystkich 106 kolumn na jednym wykresie nie jest zbyt optymalne, ale gdy chciałem stworzyć osobny dla każdego miałem błędy kompilacji. Reszta zadań - z wykonania reszty zadań jestem zadowolony.
library(dplyr)
library(ggplot2)
library(plotly)
library(tidyr)
library(knitr)
library(caret)
library(data.table)
d <- fread("~/all_summary.csv", header = TRUE, sep = ";", quote = "\"", drop = c("title", "pbd_code", "res_id", "chain_id", "local_BAa", "local_NPa", "local_Ra", "local_RGa", "local_SRGa", "local_CCSa", "local_CCPa", "local_ZOa", "local_ZDa", "local_ZD_minus_a", "local_ZD_plus_a", "local_res_atom_count", "local_res_atom_non_h_occupancy_sum", "local_res_atom_non_h_electron_occupancy_sum", "local_res_atom_C_count", "local_res_atom_N_count", "local_res_atom_O_count", "local_res_atom_S_count", "dict_atom_C_count", "dict_atom_N_count", "dict_atom_O_count", "dict_atom_S_count","fo_col", "fc_col", "weight_col", "grid_space", "solvent_radius", "solvent_opening_radius", "part_step_FoFc_std_min", "part_step_FoFc_std_max", "part_step_FoFc_std_step"))
dane <- d %>%
filter(res_name != c("UNK", "UNX", "UNL", "DUM", "N", "BLOB", "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE", "LEU", "LYS", "MET", "MSE", "PHE", "PRO", "SEC", "SER", "THR", "TRP", "TYR", "VAL", "DA", "DG", "DT", "DC", "DU", "A", "G", "T", "C", "U", "HOH", "H20", "WAT"))
test <- dane %>% drop_na()
dim(test)
## [1] 529581 389
str(c(test$res_name, test$blob_coverage, test$blob_volume_coverage, test$local_res_atom_non_h_count, test$local_res_atom_non_h_electron_sum))
## chr [1:2647905] "SF4" "XCC" "SF4" "XCC" "SF4" "SF4" "SF4" "SX" "SO4" ...
test1 <- test %>% group_by(res_name) %>% summarize(ile = n()) %>% arrange(desc(ile)) %>% head(50)
top50 <- test %>% filter(res_name == test1$res_name)
k <- top50 %>% select_if(is.numeric)
kor <- k[complete.cases(k),]
kor_round <- round(cor(kor),2)
## Warning in cor(kor): odchylenie standardowe wynosi zero
kor_melt <- melt(kor_round) %>% arrange(value)
kor_gg <- ggplot(kor_melt, aes(Var1, Var2, fill = value)) + geom_tile() +
scale_fill_gradient(low = "red", high = "blue") +
theme(axis.text.x = element_blank(), axis.text.y= element_blank())
kor_gg
ile <- top50 %>% group_by(res_name) %>% summarize(ile = n()) %>% arrange(desc(ile))
ile
## # A tibble: 50 x 2
## res_name ile
## <chr> <int>
## 1 SO4 1119
## 2 GOL 775
## 3 EDO 541
## 4 CA 445
## 5 CL 444
## 6 NAG 444
## 7 ZN 366
## 8 MG 252
## 9 PO4 214
## 10 HEM 190
## # ... with 40 more rows
ga <- ggplot(top50, aes(local_res_atom_non_h_count)) + geom_density() + ggtitle("Atomy")
ge <- ggplot(top50, aes(local_res_atom_non_h_electron_sum)) + geom_density() + ggtitle("Elektrony")
ga1 <- ggplotly(ga)
ge1 <-ggplotly(ge)
ga1
ge1
top50 %>%
select(res_name, local_res_atom_non_h_count, dict_atom_non_h_count) %>%
group_by(res_name) %>%
summarise(niezgodnosc = mean(abs(local_res_atom_non_h_count - dict_atom_non_h_count))) %>%
arrange(-niezgodnosc) %>%
head(10)
## # A tibble: 10 x 2
## res_name niezgodnosc
## <chr> <dbl>
## 1 1PE 2.88
## 2 COA 2.46
## 3 CLA 2.35
## 4 MLY 1.23
## 5 NAP 1.11
## 6 NAG 0.977
## 7 SEP 0.971
## 8 NDP 0.95
## 9 PLP 0.92
## 10 MAN 0.814
top50 %>%
select(res_name, local_res_atom_non_h_electron_sum, dict_atom_non_h_electron_sum) %>%
group_by(res_name) %>%
summarise(niezgodnosc = mean(abs(local_res_atom_non_h_electron_sum - dict_atom_non_h_electron_sum))) %>%
arrange(-niezgodnosc) %>%
head(10)
## # A tibble: 10 x 2
## res_name niezgodnosc
## <chr> <dbl>
## 1 1PE 19.6
## 2 COA 18.1
## 3 CLA 14.1
## 4 MLY 9.39
## 5 NAG 7.82
## 6 SEP 7.76
## 7 NAP 7.47
## 8 PLP 7.28
## 9 NDP 6.58
## 10 MAN 6.51
dim(top50)
## [1] 6985 389
part01 <- top50 %>% select(starts_with("part_01")) %>% gather(nazwa, wartosc, 1:106)
ggplot(part01, aes(nazwa, wartosc)) + geom_boxplot() + theme(axis.text.x = element_blank())
data_top50 <- top50 %>% select_if(is.numeric)
set.seed(23)
partition <- createDataPartition(
y = data_top50$local_res_atom_non_h_electron_sum,
p = .7,
list = FALSE)
data_train <- data_top50 %>% slice(partition)
data_test <- data_top50 %>% slice(-partition)
dim(data_train)
## [1] 4891 384
dim(data_test)
## [1] 2094 384
set.seed(23)
fit <- train(local_res_atom_non_h_electron_sum ~ ., data = data_train, method = "lm")
fit
## Linear Regression
##
## 4891 samples
## 383 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 4891, 4891, 4891, 4891, 4891, 4891, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 5.761466 0.9916907 0.5280199
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
set.seed(23)
prediction <- predict(fit, newdata = data_test)
postResample(pred = prediction, obs = data_test$local_res_atom_non_h_electron_sum)
## RMSE Rsquared MAE
## 2.2795767 0.9993083 0.3132288
data_top50a <- top50 %>% select_if(is.numeric)
set.seed(23)
partition <- createDataPartition(
y = data_top50a$local_res_atom_non_h_count,
p = .7,
list = FALSE)
data_train_a <- data_top50a %>% slice(partition)
data_test_a <- data_top50a %>% slice(-partition)
dim(data_train_a)
## [1] 4891 384
dim(data_test_a)
## [1] 2094 384
set.seed(23)
fit <- train(local_res_atom_non_h_count ~ ., data = data_train_a, method = "lm")
fit
## Linear Regression
##
## 4891 samples
## 383 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 4891, 4891, 4891, 4891, 4891, 4891, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.6641111 0.9940383 0.06723555
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
set.seed(23)
prediction <- predict(fit, newdata = data_test_a)
postResample(pred = prediction, obs = data_test_a$local_res_atom_non_h_count)
## RMSE Rsquared MAE
## 0.31732194 0.99937828 0.04707851
# usun <- c("blob_coverage", "res_coverage", "local_res_atom_non_h_count", "local_res_atom_non_h_electron_sum", "dict_atom_non_h_count", "dict_atom_non_h_electron_sum")
#
# data_top50_k <- top50 %>% select(-usun)
#
# data_top50_k$res_name <- as.factor(data_top50_k$res_name)
#
# set.seed(23)
# partition <- createDataPartition(
# y = data_top50_k$res_name,
# p = .7,
# list = FALSE)
# data_train <- data_top50_k %>%
# slice(partition)
# data_test <- data_top50_k %>%
# slice(-partition)
# dim(data_train)
# dim(data_test)
#
# set.seed(23)
# fit <- train(
# res_name ~ .,
# data = data_train,
# method = "rf",
# ntree = 10,
# na.action = na.pass)
# fit
#
# set.seed(23)
# prediction <- predict(fit, newdata = data_test)
# confusionMatrix(data = prediction, data_test$res_name)